# Replication code for Taylor C. Boas, Dino P. Christenson, and David M. Glick, "Recruiting Large Online Samples in the United States and India: Facebook, Mechanical Turk and Qualtrics," Political Science Research and Methods.

# Analysis conducted in R 3.4.3 on MacOS 10.13.2

# NOTE: This file merges external geographic data into the cleaned and combined U.S. survey data file and the CCES survey that we use as a benchmark. It also calculates population weights for the U.S. survey data. Files should be run in the following order; please see readme.txt for details.
# 	1. clean_us_survey.R
# 	2. clean_india_survey.R
# 	3. merge_external_data_us.R
# 	4. merge_external_data_india.R
# 	5. analyze_demographics.R
# 	6. analyze_spaces.R
# 	7. analyze_politics.R
# 	8. analyze_cooperativeness.R
# 	9. analyze_experiments.R

# Set working directory as appropriate
# setwd('~/Dropbox/sample recruitment shared/replication/')

# Clean desktop and load packages. Please make sure all necessary packages are installed.
rm(list=ls(all=T))
library(openxlsx)
library(foreign)

# Load data files; save variable names
load('us.RData')
varnames.us<-names(us)
load('cces.RData')

#################################################################
# Merge US county data with ZIP codes in our survey data and CCES
#################################################################

# Load external data files

zip_fips<-read.xlsx('ZipToCounty.xlsx',sheet=2) # From https://www.huduser.gov/portal/datasets/usps_crosswalk.html
county_codes<-read.xlsx('ruralurbancodes2013.xlsx',sheet=1) # From http://www.ers.usda.gov/data-products/rural-urban-continuum-codes/.aspx
county_data<-read.delim('2014_Gaz_counties_national.txt',colClasses='character') # From https://www.census.gov/geo/maps-data/data/gazetteer2014.html

# Step 1: merge into our survey data

# Reduce zip_fips to those in our data. Merge in rural-urban continuum code, population, and land area, and calculate population density.
zip_fips1<-zip_fips[zip_fips$ZIP %in% us$Q28,]
zip_fips1<-merge(zip_fips1, county_codes, by.x='COUNTY',by.y='FIPS')
zip_fips1<-merge(zip_fips1, county_data, by.x='COUNTY',by.y='GEOID')
zip_fips1$density<-zip_fips1$Population_2010/(as.numeric(zip_fips1$ALAND)/1000000) # People per square kilometer (dividing by 1,000,000 since land area data is in square meters)

# Aggregate to ZIP level. For ZIPs split between more than one county, calculate average population, population density, and rural-urban continuum code, weighted by TOT_RATIO. RES_RATIO (ratio of residential addresses in that ZIP and County to total residential addresses in that ZIP) would make more sense, but we get some zeros here, which would indicate no residential addresses, which makes no sense.

zips1<-with(zip_fips1,aggregate(list(pop=Population_2010* TOT_RATIO,rucc=RUCC_2013* TOT_RATIO,density=density*TOT_RATIO),list(zip=ZIP),sum))

# Merge into survey data

us<-merge(us,zips1,by.x='Q28',by.y='zip',all.x=T)

# Step 1: merge into CCES data

# Reduce zip_fips to those in CCES data. Merge in rural-urban continuum code, population, and land area, and calculate population density.

zip_fips2<-zip_fips[zip_fips$ZIP %in% cces$lookupzip,]
zip_fips2<-merge(zip_fips2, county_codes, by.x='COUNTY',by.y='FIPS')
zip_fips2<-merge(zip_fips2, county_data, by.x='COUNTY',by.y='GEOID')
zip_fips2$density<-zip_fips2$Population_2010/(as.numeric(zip_fips2$ALAND)/1000000) # People per square kilometer (dividing by 1,000,000 since land area data is in square meters)

# Aggregate to ZIP level. For ZIPs split between more than one county, calculate average population, population density, and rural-urban continuum code, weighted by TOT_RATIO. RES_RATIO (ratio of residential addresses in that ZIP and County to total residential addresses in that ZIP) would make more sense, but we get some zeros here, which would indicate no residential addresses, which makes no sense.

zips2<-with(zip_fips2,aggregate(list(pop=Population_2010* TOT_RATIO,rucc=RUCC_2013* TOT_RATIO,density=density*TOT_RATIO),list(zip=ZIP),sum))

# Merge into CCES data, save

cces<-merge(cces,zips2,by.x='lookupzip',by.y='zip',all.x=T)
save(cces, file='cces_augmented.RData')

##########################################
# Calculate weights for each US sample
# based on region (4 categories), age
# (4 categories), and sex 
##########################################

# Samples for calculating weights: completed surveys only

us1<-us[us$Finished==1,]

# Read and format US population data. Restrict to ages 18+

us_pop<-lapply(1:4,function(x) read.xlsx('AgeSexRegion.xlsx',sheet=x,startRow=7,skipEmptyRows=T,colNames=F)) # From http://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=DEC_10_SF1_QTP2&prodType=table
us_pop<-lapply(us_pop,function(x) x[!is.na(x$X1),c('X1','X4','X5')])
us_pop<-lapply(us_pop,function(x) apply(x,2,function(y) gsub(',','',y)))
us_pop<-lapply(us_pop,function(x) apply(x,2,function(y) gsub(' years','',y)))
us_pop<-lapply(us_pop,function(x) apply(x,2,function(y) gsub('^ *','',y)))
us_pop<-lapply(us_pop,function(x) apply(x,2,as.numeric))
us_pop<-lapply(us_pop,function(x) rbind(x,c(100,apply(x[(nrow(x)-2):(nrow(x)),-1],2,sum)))) #Sum the three over-100 categories
us_pop<-lapply(us_pop,function(x) x[which(x[,1] >= 18),]) #18 and over only 
us_pop<-lapply(us_pop,data.frame)
us_pop[[1]]$region<-'West'
us_pop[[2]]$region<-'South'
us_pop[[3]]$region<-'Northeast'
us_pop[[4]]$region<-'Midwest'
us_pop<-do.call(rbind,us_pop)
names(us_pop)<-c('age','male','female','region')

# Calculating quartiles of age

us_pop_nat<-aggregate(list(male=us_pop$male,female=us_pop$female),list(age=us_pop$age),sum)
us_cumpct<-data.frame(age=18:100, cumpct=100*cumsum(us_pop_nat$male+ us_pop_nat$female)/sum(us_pop_nat$male+ us_pop_nat$female)) # 31, 45, 59
us_quartiles<-sapply(c(25,50,75),function(x) us_cumpct$age[which(us_cumpct$cumpct == min(us_cumpct$cumpct[us_cumpct$cumpct >= x]))])

# Checking region, age, and sex distribution of samples. Start by defining variables region and age range for the sample.

us1$region<-NA
us1$region[us1$Q27 %in% c(3,6,13,27,29,32,46,52,2,5,12,38,49)]<-'West'
us1$region[us1$Q27 %in% c(8,9,10,11,21,34,42,48,50,1,18,25,44,4,19,37,45)]<-'South'
us1$region[us1$Q27 %in% c(7,20,22,30,41,47,31,33,39)]<-'Northeast'
us1$region[us1$Q27 %in% c(14,15,23,36,51,16,17,24,26,28,35,43)]<-'Midwest'

us1$age.range<-1*(as.numeric(us1$Q3)<us_quartiles[1])+2*(as.numeric(us1$Q3) >= us_quartiles[1] & as.numeric(us1$Q3) < us_quartiles[2])+3*(as.numeric(us1$Q3) >= us_quartiles[2] & as.numeric(us1$Q3) < us_quartiles[3])+4*(as.numeric(us1$Q3)>=us_quartiles[3])

# Calculate weights

us_pop$age.range<-1*(as.numeric(us_pop$age)<us_quartiles[1])+2*(as.numeric(us_pop$age) >= us_quartiles[1] & as.numeric(us_pop$age) < us_quartiles[2])+3*(as.numeric(us_pop$age) >= us_quartiles[2] & as.numeric(us_pop$age) < us_quartiles[3])+4*(as.numeric(us_pop$age)>=us_quartiles[3])
us_pop_agg<-aggregate(list(male=us_pop$male,female=us_pop$female),list(region=us_pop$region,age.range=us_pop$age.range),sum)
us_pop_agg_male<-data.frame(region=us_pop_agg$region,age.range=us_pop_agg$age.range,male=T,pop=us_pop_agg$male)
us_pop_agg_female<-data.frame(region=us_pop_agg$region,age.range=us_pop_agg$age.range,male=F,pop=us_pop_agg$female)
pop_tot<-rbind(us_pop_agg_female, us_pop_agg_male)
pop_tot$prop<-pop_tot$pop/sum(pop_tot$pop)

us1$male<-us1$Q22==1
pop_fb<-with(us1[us1$sample=='Facebook',],aggregate(list(num=region),list(region=region,age.range=age.range,male=male),length))
pop_fb$prop<-pop_fb$num/sum(pop_fb$num)
pop_fb$weight<-pop_tot$prop/pop_fb$prop
pop_fb$sample<-'Facebook'
pop_mt<-with(us1[us1$sample=='MTurk',],aggregate(list(num=region),list(region=region,age.range=age.range,male=male),length))
pop_mt$prop<-pop_mt$num/sum(pop_mt$num)
pop_mt$weight<-pop_tot$prop/pop_mt$prop
pop_mt$sample<-'MTurk'
pop_qt<-with(us1[us1$sample=='Qualtrics',],aggregate(list(num=region),list(region=region,age.range=age.range,male=male),length))
pop_qt$prop<-pop_qt$num/sum(pop_qt$num)
pop_qt$weight<-pop_tot$prop/pop_qt$prop
pop_qt$sample<-'Qualtrics'

weights<-rbind(pop_fb, pop_mt, pop_qt)

# Merge weights into data frame by sample.

us1<-merge(us1,weights[,c('region','age.range','male','sample','weight')])

# Remove redundant "male" variable and reorder to match us.RData, then save.
us1$male<-NULL
us1<-us1[,c(varnames.us, setdiff(names(us1),varnames.us))]

save(us1, file='us_completions_augmented.RData')